{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python3",
   "display_name": "DataAnalysis"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from tools import *\n",
    "%matplotlib inline"
   ]
  },
  {
   "source": [
    "# Chap 3 处理原始文本\n",
    "1.  如何访问文件内的文本？\n",
    "2.  如何将文档分割成单独的单词和标点符号，从而进行文本语料上的分析？\n",
    "3.  如何产生格式化的输出，并把结果保存在文件中？"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "## 3.5. 正则表达式的有益应用"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "### 3.5.1 提取字符块"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "--------------- >P109 提取元音字符块< ---------------\n['u', 'e', 'a', 'i', 'a', 'i', 'i', 'i', 'e', 'i', 'a', 'i', 'o', 'i', 'o', 'u'] 长度=  16\n"
     ]
    }
   ],
   "source": [
    "show_subtitle(\"P109 提取元音字符块\")\n",
    "word = 'supercalifragilisticexpialidocious'\n",
    "word_pieces = re.findall(r'[aeiou]', word)\n",
    "print(word_pieces, \"长度= \", len(word_pieces))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "--------------- >P109 提取两个元音字符块< ---------------\n",
      "['ea', 'oi', 'ea', 'ou', 'oi', 'ea', 'ea', 'oi', 'oi', 'ea', 'io', 'ea', 'ea']\n"
     ]
    }
   ],
   "source": [
    "show_subtitle(\"P109 提取两个元音字符块\")\n",
    "wsj = sorted(set(nltk.corpus.treebank.words()))\n",
    "word_pieces_list = [\n",
    "        vs\n",
    "        for word in wsj\n",
    "        for vs in re.findall(r'[aeiou]{2,}', word)\n",
    "]\n",
    "print(word_pieces_list[:13])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "--------------- >统计双元音字符块的数目< ---------------\n"
     ]
    },
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "[('io', 549),\n",
       " ('ea', 476),\n",
       " ('ie', 331),\n",
       " ('ou', 329),\n",
       " ('ai', 261),\n",
       " ('ia', 253),\n",
       " ('ee', 217),\n",
       " ('oo', 174),\n",
       " ('ua', 109),\n",
       " ('au', 106),\n",
       " ('ue', 105),\n",
       " ('ui', 95)]"
      ]
     },
     "metadata": {},
     "execution_count": 4
    }
   ],
   "source": [
    "show_subtitle(\"统计双元音字符块的数目\")\n",
    "fd = nltk.FreqDist(word_pieces_list)\n",
    "fd.most_common(12)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "--------------- >提取日期格式中的整数值< ---------------\n[2009, 12, 31]\n"
     ]
    }
   ],
   "source": [
    "show_subtitle(\"提取日期格式中的整数值\")\n",
    "numbers_list = [\n",
    "        int(n)\n",
    "        for n in re.findall(r'[0-9]+', '2009-12-31')\n",
    "]\n",
    "print(numbers_list)"
   ]
  },
  {
   "source": [
    "### 3.5.2 在字符块上做更多的事情"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "--------------- >P110 忽略掉单词内部的元音< ---------------\nenglish_tmp1=  ['Unvrsl', 'Dclrtn', 'of', 'Hmn', 'Rghts', 'Prmble', 'Whrs', 'rcgntn', 'of', 'the', 'inhrnt', 'dgnty', 'and']\nlen(english_tmp1)=  1781\nenglish_tmp2=  ['nvrsl', 'Dclrtn', 'f', 'Hmn', 'Rghts', 'Prmbl', 'Whrs', 'rcgntn', 'f', 'th', 'nhrnt', 'dgnty', 'nd']\nlen(english_tmp2)=  1781\nenglish_udhr[:75]=  Unvrsl Dclrtn of Hmn Rghts Prmble Whrs rcgntn of the inhrnt dgnty and\nof the eql and inlnble rghts of all mmbrs of the hmn fmly is the fndtn\nof frdm , jstce and pce in the wrld , Whrs dsrgrd and cntmpt fr hmn\nrghts hve rsltd in brbrs acts whch hve outrgd the cnscnce of mnknd ,\nand the advnt of a wrld in whch hmn bngs shll enjy frdm of spch and\n"
     ]
    }
   ],
   "source": [
    "# 使用findall()完成更加复杂的任务\n",
    "show_subtitle(\"P110 忽略掉单词内部的元音\")\n",
    "# 第一个模板保证元音在首字母和元音在尾字母的依然保留\n",
    "regexp1 = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'\n",
    "# 第二个模板会删除所有元音字母\n",
    "regexp2 = r'[^AEIOUaeiou]'\n",
    "\n",
    "\n",
    "def compress(word, regexp):\n",
    "    pieces = re.findall(regexp, word)\n",
    "    return ''.join(pieces)\n",
    "\n",
    "\n",
    "english_udhr = nltk.corpus.udhr.words('English-Latin1')\n",
    "\n",
    "english_tmp1 = [\n",
    "        compress(w, regexp1)\n",
    "        for w in english_udhr\n",
    "]\n",
    "print(\"english_tmp1= \", english_tmp1[:13])\n",
    "print(\"len(english_tmp1)= \", len(english_tmp1))\n",
    "\n",
    "english_tmp2 = [\n",
    "        compress(w, regexp2)\n",
    "        for w in english_udhr\n",
    "]\n",
    "print(\"english_tmp2= \", english_tmp2[:13])\n",
    "print(\"len(english_tmp2)= \", len(english_tmp2))\n",
    "\n",
    "print(\"english_udhr[:75]= \", nltk.tokenwrap(english_tmp1[:75]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "--------------- >P111 提取辅音-元音序列对，并且统计单词库中这样的序列对的数目< ---------------\n",
      "    a   e   i   o   u \n",
      "k 418 148  94 420 173 \n",
      "p  83  31 105  34  51 \n",
      "r 187  63  84  89  79 \n",
      "s   0   0 100   2   1 \n",
      "t  47   8   0 148  37 \n",
      "v  93  27 105  48  49 \n"
     ]
    }
   ],
   "source": [
    "show_subtitle(\"P111 提取辅音-元音序列对，并且统计单词库中这样的序列对的数目\")\n",
    "rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')\n",
    "cvs = [\n",
    "        cv\n",
    "        for w in rotokas_words\n",
    "        for cv in re.findall(r'[ptksvr][aeiou]', w)\n",
    "]\n",
    "cfd = nltk.ConditionalFreqDist(cvs)\n",
    "cfd.tabulate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "--------------- >定义「辅音-元音序列对」所对应的单词集合< ---------------\ncv_index['su']=  ['kasuari']\ncv_index['po']=  ['kaapo', 'kaapopato', 'kaipori', 'kaiporipie', 'kaiporivira', 'kapo', 'kapoa', 'kapokao', 'kapokapo', 'kapokapo', 'kapokapoa', 'kapokapoa', 'kapokapora', 'kapokapora', 'kapokaporo', 'kapokaporo', 'kapokari', 'kapokarito', 'kapokoa', 'kapoo', 'kapooto', 'kapoovira', 'kapopaa', 'kaporo', 'kaporo', 'kaporopa', 'kaporoto', 'kapoto', 'karokaropo', 'karopo', 'kepo', 'kepoi', 'keposi', 'kepoto']\n"
     ]
    }
   ],
   "source": [
    "show_subtitle(\"定义「辅音-元音序列对」所对应的单词集合\")\n",
    "cv_word_pairs = [\n",
    "        (cv, w)\n",
    "        for w in rotokas_words\n",
    "        for cv in re.findall(r'[ptksvr][aeiou]', w)\n",
    "]\n",
    "cv_index = nltk.Index(cv_word_pairs)\n",
    "print(\"cv_index['su']= \", cv_index['su'])\n",
    "print(\"cv_index['po']= \", cv_index['po'])"
   ]
  },
  {
   "source": [
    "### 3.5.3 查找词干"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "['ing']\n"
     ]
    }
   ],
   "source": [
    "# 只取出词尾（只提取了后缀，没有提出词干）\n",
    "regexp = r'^.*(ing|ly|ed|ious|ies|ive|es|s|ment)$'\n",
    "print(re.findall(regexp, 'processing'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "['processing']\n[]\n"
     ]
    }
   ],
   "source": [
    "# 输出了整个单词（提取符合后缀的字符串，\"(?:)\"的作用，但是没有提取出词干）\n",
    "regexp = r'^.*(?:ing|ly|ed|ious|ies|ive|es|s|ment)$'\n",
    "print(re.findall(regexp, 'processing'))  # 符合词缀要求的单词可以提取出来\n",
    "print(re.findall(regexp, 'processooo'))  # 不符合词缀要求的单词就不提取出来"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "[('process', 'ing')]\n[('processe', 's')]\n"
     ]
    }
   ],
   "source": [
    "# 将单词分解为词干和后缀\n",
    "regexp = r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$'\n",
    "print(re.findall(regexp, 'processing'))\n",
    "print(re.findall(regexp, 'processes'))  # 使用贪婪匹配模式，错误分解单词"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "[('process', 'es')]\n[('proces', 's')]\n[]\n"
     ]
    }
   ],
   "source": [
    "# 不使用贪婪匹配模式\n",
    "regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$'\n",
    "print(re.findall(regexp, 'processes'))\n",
    "print(re.findall(regexp, 'process'))  # 需要单词背景知识，将这类单词剔除，否则会错误地提取词干\n",
    "print(re.findall(regexp, 'language'))  # 没有单词背景知识时，如果对于没有词缀的单词会无法提取出单词来"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "[('language', '')]\n"
     ]
    }
   ],
   "source": [
    "# 正确处理没有后缀的单词\n",
    "regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'\n",
    "print(re.findall(regexp, 'language'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 更加准确地词干提取模板，先将原始数据分词，然后提取分词后的词干\n",
    "def stem(word):\n",
    "    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'\n",
    "    stem, suffix = re.findall(regexp, word)[0]\n",
    "    return stem"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "['DENNIS', ':', 'Listen', ',', 'strange', 'women', 'ly', 'in', 'pond', 'distribut', 'sword', 'i', 'no', 'basi', 'for', 'a', 'system', 'of', 'govern', '.', 'Supreme', 'execut', 'power', 'deriv', 'from', 'a', 'mandate', 'from', 'the', 'mass', ',', 'not', 'from', 'some', 'farcical', 'aquatic', 'ceremony', '.']\n"
     ]
    }
   ],
   "source": [
    "raw = \"\"\"DENNIS: Listen,\n",
    "strange women lying in ponds distributing swords\n",
    "is no basis for a system of government.\n",
    "Supreme executive power derives from a mandate from the masses, \n",
    "not from some farcical aquatic ceremony.\"\"\"\n",
    "\n",
    "tokens = nltk.word_tokenize(raw)\n",
    "stem_list = [\n",
    "        stem(t)\n",
    "        for t in tokens\n",
    "]\n",
    "print(stem_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "DENNIS: Listen,\nstrange women lying in ponds distributing sword{s}\nis no basis for a system of government.\nSupreme executive power derives from a mandate from the masses, \nnot from some farcical aquatic ceremony.\n"
     ]
    }
   ],
   "source": [
    "# 正则表达式的展示函数nltk.re_show()，可以把符合正则表达式要求的字符标注出来\n",
    "# 不能使用re.findall()中的正则表达式标准。需要使用基本的正则表达式标准。\n",
    "regexp = r'[ing|ly|ed|ious|ies|ive|es|s|ment]$'\n",
    "nltk.re_show(regexp, raw)  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "DENNIS: Listen,\nstrange women ly{ing} in ponds distribut{ing} swords\nis no basis for a system of government.\nSupreme executive power derives from a mandate from the masses, \nnot from some farcical aquatic ceremony.\n"
     ]
    }
   ],
   "source": [
    "regexp = r'(ing)'\n",
    "nltk.re_show(regexp, raw)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "DENNIS: L{i}ste{n},\nstra{n}{g}e wome{n} ly{i}{n}{g} {i}{n} po{n}ds d{i}str{i}but{i}{n}{g} swords\n{i}s {n}o bas{i}s for a system of {g}over{n}me{n}t.\nSupreme execut{i}ve power der{i}ves from a ma{n}date from the masses, \n{n}ot from some farc{i}cal aquat{i}c ceremo{n}y.\n"
     ]
    }
   ],
   "source": [
    "regexp = r'[ing]'\n",
    "nltk.re_show(regexp, raw)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "DENNIS: Listen,\nstrange women ly{ing} in ponds distribut{ing} swords\nis no basis for a system of government.\nSupreme executive power derives from a mandate from the masses, \nnot from some farcical aquatic ceremony.\n"
     ]
    }
   ],
   "source": [
    "regexp = r'ing'\n",
    "nltk.re_show(regexp, raw)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "{D}ENNIS: Listen,\n{s}trange women lying in ponds distributing swords\n{i}s no basis for a system of government.\n{S}upreme executive power derives from a mandate from the masses, \n{n}ot from some farcical aquatic ceremony.\n"
     ]
    }
   ],
   "source": [
    "regexp = '^[D|s|i|S|n]'\n",
    "nltk.re_show(regexp, raw)  # '^' 表示行的开头"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "{D}ENNIS: Listen,\n{s}trange women lying in ponds distributing swords\n{i}s no basis for a system of government.\n{S}upreme executive power derives from a mandate from the masses, \n{n}ot from some farcical aquatic ceremony.\n"
     ]
    }
   ],
   "source": [
    "regexp = '^[DsiSn]'\n",
    "nltk.re_show(regexp, raw)  # '[]' 内，用不用|都表示析取"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "DENNIS: Listen{,}\nstrange women lying in ponds distributing sword{s}\nis no basis for a system of government{.}\nSupreme executive power derives from a mandate from the masses, \nnot from some farcical aquatic ceremony{.}\n"
     ]
    }
   ],
   "source": [
    "regexp = '[s|.|,]$'\n",
    "nltk.re_show(regexp, raw)  # '$' 表示行的结尾"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "DENNIS: Listen,\nstrange women ly{ing} in ponds distribut{ing} swords\nis no basis for a system of government.\nSupreme execu{tive} power derives from a mandate from the masses, \nnot from some farcical aquatic ceremony.\n"
     ]
    }
   ],
   "source": [
    "regexp = 'ing|tive'\n",
    "nltk.re_show(regexp, raw)  # '|' 表示析取指定的字符串"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "DENNIS: Li{s}ten,\n{s}trange women lying in pond{s} di{s}tributing {s}word{s}\ni{s} no ba{s}i{s} for a {s}y{s}tem of government.\nSupreme executive power derive{s} from a mandate from the ma{ss}e{s}, \nnot from {s}ome farcical aquatic ceremony.\n"
     ]
    }
   ],
   "source": [
    "regexp = '(s){1,2}'\n",
    "nltk.re_show(regexp, raw)  # '{}' 表示重复的次数"
   ]
  },
  {
   "source": [
    "### 3.5.4 搜索已经分词的文本"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "['[', 'Moby', 'Dick', 'by', 'Herman', 'Melville', '1851', ']', 'ETYMOLOGY', '.', '(', 'Supplied', 'by']\n"
     ]
    }
   ],
   "source": [
    "# P114 对已经实现分词的文本（Text）进行搜索（findall）\n",
    "from nltk.corpus import gutenberg, nps_chat\n",
    "\n",
    "moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))\n",
    "tokens = moby.tokens\n",
    "print(tokens[:13])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "a monied man; a nervous man; a dangerous man; a white man; a white\nman; a white man; a pious man; a queer man; a good man; a mature man;\na white man; a Cape man; a great man; a wise man; a wise man; a\nbutterless man; a white man; a fiendish man; a pale man; a furious\nman; a better man; a certain man; a complete man; a dismasted man; a\nyounger man; a brave man; a brave man; a brave man; a brave man\n"
     ]
    }
   ],
   "source": [
    "# ToDo: ?: 是做什么用的？\n",
    "regexp = r\"(?:<a> <.*> <man>)\"\n",
    "moby.findall(regexp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "a monied man; a nervous man; a dangerous man; a white man; a white\nman; a white man; a pious man; a queer man; a good man; a mature man;\na white man; a Cape man; a great man; a wise man; a wise man; a\nbutterless man; a white man; a fiendish man; a pale man; a furious\nman; a better man; a certain man; a complete man; a dismasted man; a\nyounger man; a brave man; a brave man; a brave man; a brave man\n"
     ]
    }
   ],
   "source": [
    "regexp = r\"(<a> <.*> <man>)\"\n",
    "moby.findall(regexp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "monied; nervous; dangerous; white; white; white; pious; queer; good;\nmature; white; Cape; great; wise; wise; butterless; white; fiendish;\npale; furious; better; certain; complete; dismasted; younger; brave;\nbrave; brave; brave\n"
     ]
    }
   ],
   "source": [
    "# 找出文本中\"a <word> man\"中的word\n",
    "regexp = r\"<a>(<.*>)<man>\"\n",
    "moby.findall(regexp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "[ Moby Dick by Herman Melville 1851 ] ETYMOLOGY . ( Suppli{ed} by a Late Consumptive Usher to a Grammar School ) The pale Usher -- threadbare in coat , heart , body , and brain ; I see him now . He was ever dust{ing} his old lexicons and grammars , with a queer handkerchief , mock{ing}{ly} embellish{ed} with all the gay flags of all the known nations of the world . He lov{ed}\n"
     ]
    }
   ],
   "source": [
    "regexp = 'ly|ed|ing'\n",
    "nltk.re_show(regexp, ' '.join(tokens[:75]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "[ Moby Dick by Herman Melville 1851 ] ETYMOLOGY . ( Supplied by a Late Consumptive Usher to a Grammar School ) The pale Usher -- threadbare in coat , heart , body , and brain ; I {see him now} . He was ever dusting his old lexicons and grammars , with a queer handkerchief , mockingly embellished with all the gay flags of all the known nations of the world . He loved to dust his old grammars ; it somehow mildly reminded him of his mortality . \" While you take in hand to school others , and to teach them by what name a whale - fish is to be called in our tongue leaving out , through ignorance , the letter H , which almost alone maketh the signification of the word , you deliver that which is not true .\" -- HACKLUYT \" WHALE . ... Sw . and Dan . HVAL . This animal is named from roundness or rolling ; for in Dan . HVALT is arched or vaulted .\" -- WEBSTER ' S DICTIONARY \" WHALE . ... It is more immediately from the Dut . and Ger . WALLEN ;\n"
     ]
    }
   ],
   "source": [
    "regexp = 'see [a-z]+ now'\n",
    "nltk.re_show(regexp, ' '.join(tokens[:200]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "['now', 'im', 'left', 'with', 'this', 'gay', 'name', ':P', 'PART', 'hey', 'everyone', 'ah', 'well']\n"
     ]
    }
   ],
   "source": [
    "chat = nltk.Text(nps_chat.words())\n",
    "tokens = chat.tokens\n",
    "print(tokens[:13])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "you rule bro; telling you bro; u twizted bro\n"
     ]
    }
   ],
   "source": [
    "regexp = r\"<.*><.*><bro>\"\n",
    "chat.findall(regexp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "lol lol lol; lmao lol lol; lol lol lol; la la la la la; la la la; la\nla la; lovely lol lol love; lol lol lol.; la la la; la la la\n"
     ]
    }
   ],
   "source": [
    "regexp = r\"<l.*>{3,}\"\n",
    "chat.findall(regexp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "now im {left with this gay name :P PART hey everyone ah well NICK : U7 U7 is a gay name . . ACTION gives U121 a golf clap . :) JOIN hi U59 26 / m / ky women that are nice please pm me JOIN PART there ya go U7 do n't golf clap me . fuck you U121 :@ whats everyone up to ? PART PART i 'll thunder clap your ass . PART and i dont even know what that means . that sounds painful any ladis wanna chat ? 29 m 26 / m JOIN my cousin drew a messed up pic on my cast PART 24 / m boo . 26 / m and sexy lol U115 boo . JOIN PART he drew a girl with legs spread boo . hope he didnt draw a penis PART ewwwww lol & a head between her legs JOIN JOIN sounds good to me . r u serious JOIN PART I 'll take one , please . & i have to go to the docs tomorrow ya man I am too .. Connected to ... Slip away ... Fade away ... Days away I ... Still feel}\n"
     ]
    }
   ],
   "source": [
    "regexp = 'l.+'\n",
    "nltk.re_show(regexp, ' '.join(tokens[:200]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "you ... Touc{hing me ... Changing me ... Considerably killing me ... heeeey ! do n't you have a sharpie ? 26 / m you 're back U115 U129 yep U115 Any ladies wanna chat with 24 / m hurry ladies PART JOIN JOIN not fast enough U116 a bowl i got a blunt an a bong ...... lol JOIN well , glad it worked out my chair is too hard . Anyone from Tennessee in here ? hey ladies as am i is U68 back yet PART hey PART JOIN U121 is missing a B in her name and i do n't complain about things being hard very often . ok yes U30 fire it up Any women from Nashville in here ? JOIN PART and an an \" itch \" JOIN yo , U133 or a \" ogan \" are you a male ? JOIN JOIN show will let 's talk . PART :) haha brb opps JOIN PART sho * . ACTION keeps U115 s place nice and warm . hey any guys with cams wanna play ? . ACTION sits on U68 's lap . JOIN JOIN any guyz wanna chat hi there boo , it}\n"
     ]
    }
   ],
   "source": [
    "regexp = 'h.+'\n",
    "nltk.re_show(regexp, ' '.join(tokens[200:400]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 正则表达式的测试界面\n",
    "nltk.app.nemo()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "['Too', 'often', 'a', 'beginning', 'bodybuilder', 'has', 'to', 'do', 'his', 'training', 'secretly', 'either', 'because']\n"
     ]
    }
   ],
   "source": [
    "from nltk.corpus import brown\n",
    "\n",
    "hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))\n",
    "print(hobbies_learned[:13])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "speed and other activities; water and other liquids; tomb and other\nlandmarks; Statues and other monuments; pearls and other jewels;\ncharts and other items; roads and other features; figures and other\nobjects; military and other areas; demands and other factors;\nabstracts and other compilations; iron and other metals\n"
     ]
    }
   ],
   "source": [
    "regexp = r\"<\\w*> <and> <other> <\\w*s>\"\n",
    "hobbies_learned.findall(regexp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "as accurately as possible; as well as the; as faithfully as possible;\nas much as what; as neat as a; as simple as you; as well as other; as\nwell as other; as involved as determining; as well as other; as\nimportant as another; as accurately as possible; as accurate as any;\nas much as any; as different as a; as Orphic as that; as coppery as\nDelawares; as good as another; as large as small; as well as ease; as\nwell as their; as well as possible; as straight as possible; as well\nas nailed; as smoothly as the; as soon as a; as well as injuries; as\nwell as many; as well as reason; as well as in; as well as of; as well\nas a; as well as summer; as well as providing; as important as\ncooling; as evenly as it; as much as shading; as well as some; as well\nas subsoil; as high as possible; as well as many; as general as\nelectrical; as long as the; as well as the; as much as was; as well as\nset; as well as by; as high as 15; as well as aid; as much as\npossible; as well as personalities; as low as a; as well as the; as\nmuch as glass; as popular as renting; as expensive as most; as well as\nrelative; as well as by; as well as the; as far as possible; as far as\nradiation; as well as theoretical; as well as nuclear; as small as\npossible; as well as soap; as effective as the; as much as\napproximately; as well as information; as little as one; as much as\nan; as low as Af; as long as the; as far as possible; as well as\ntheir; as well as Hand; as well as all; as well as fractionation; as\npotent as the; as well as fever; as large as 3; as well as varying; as\nwell as the; as long as 2; as far as emotional; as well as the; as\nwell as regarding; as well as enthusiasm; as well as by; as well as\nher; as well as a; as old as social; as well as the; as well as the;\nas well as in; as much as they; as much as possible; as well as the;\nas well as some; as simple as one; as well as the; as well as in; as\ndefinable as possible; as long as they; as well as their; as well as\nforecasting; as soon as possible; as inevitable as anything; as well\nas for; as well as for; as nebulous as the; as awkward as the; as well\nas the; as well as by; as well as those; as well as the; as well as\nan; as well as with; as well as the; as well as moral; as much as\ntheir; as well as that; as likely as not; as well as upon; as well as\non; as well as upon; as long as all; as far as one; as long as the; as\nempty as the; as well as the; as well as the; as soon as they; as well\nas office; as speedily as possible; as well as of; as well as start;\nas well as behind; as much as for; as effectively as they; as\nimportant as it; as nearly as feasible; as well as form; as well as\naesthetic; as well as ethical; as well as Impressionism; as well as\nthe; as broad as the; as much as he; as arresting as a; as odd as the;\nas well as the; as soon as possible; as long as it; as impassive as\nPersian; as long as those; as importantly as his; as well as\nproviding; as well as the; as well as vertically; as well as new; as\nwell as certain; as well as the; as close as possible; as far as\nobtainable; as well as the; as important as the; as long as the; as\nsatisfactory as those\n"
     ]
    }
   ],
   "source": [
    "regexp=r\"<as><\\w*><as><\\w*>\"\n",
    "hobbies_learned.findall(regexp)"
   ]
  }
 ]
}